Set directory path and load libraries
# Set directory path
dir_path <- "~/Google Drive File Stream/My Drive/Manuscript - Zebrafish RGC Atlas/Analysis/Analysis2020/"
setwd(dir_path)
# Load libraries
library(Seurat)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
source("utils/utilFxns.R")
source("utils/plottingFxns.R")
Create the initial Seurat object by loading the Count matrix and set batch information
# Load adult zebrafish count matrix
Count.mat <- readRDS("../CountMatrices/ConsolidatedCounts_Zfish_013020.rds")
# Remove sample ZfishRGC16 due to wetting sample failure
cells.remove = grep("ZfishRGC16_",colnames(Count.mat), value=TRUE)
Count.mat = Count.mat[, setdiff(colnames(Count.mat), cells.remove)]
# Create Seurat object, removing genes that are expressed in fewer than 25 cells and removing cells
# that have fewer than 450 features
adult <- CreateSeuratObject(counts = Count.mat, project = "zfishRGC", min.cells = 25, min.features = 450)
Feature names cannot have underscores ('_'), replacing with dashes ('-')
# Check QC metrics, including RNA counts and mitochondrial scores
adult[["percent.mt"]] <- PercentageFeatureSet(adult, pattern = "^MT-")
adult[["percent.rps"]] <- PercentageFeatureSet(adult, pattern = "^RPS")
adult[["percent.rpl"]] <- PercentageFeatureSet(adult, pattern = "^RPL")
adult[["percent.rp"]] <- adult[["percent.rps"]] + adult[["percent.rpl"]]
VlnPlot(adult, features = c("nCount_RNA", "percent.mt", "percent.rp"))
# Change the order of factor
adult@meta.data$orig.ident = factor(adult@meta.data$orig.ident, levels = paste0("ZfishRGC",c(1:15)))
# Set batch information in meta.data
batchname = as.character(adult@meta.data$orig.ident)
batchid = rep("Batch0", length(batchname))
batchid[grep("ZfishRGC1$", batchname)] = "Batch1"
batchid[grep("ZfishRGC2", batchname)] = "Batch1"
batchid[grep("ZfishRGC3", batchname)] = "Batch2"
batchid[grep("ZfishRGC4", batchname)] = "Batch2"
batchid[grep("ZfishRGC5", batchname)] = "Batch3"
batchid[grep("ZfishRGC6", batchname)] = "Batch3"
batchid[grep("ZfishRGC7", batchname)] = "Batch3"
batchid[grep("ZfishRGC8", batchname)] = "Batch3"
batchid[grep("ZfishRGC9", batchname)] = "Batch4"
batchid[grep("ZfishRGC10", batchname)] = "Batch4"
batchid[grep("ZfishRGC11", batchname)] = "Batch5"
batchid[grep("ZfishRGC12", batchname)] = "Batch5"
batchid[grep("ZfishRGC13", batchname)] = "Batch5"
batchid[grep("ZfishRGC14", batchname)] = "Batch5"
batchid[grep("ZfishRGC15", batchname)] = "Batch5"
adult@meta.data$batch = factor(batchid)
table(adult@meta.data$orig.ident, adult@meta.data$batch)
Batch1 Batch2 Batch3 Batch4 Batch5
ZfishRGC1 3094 0 0 0 0
ZfishRGC2 3235 0 0 0 0
ZfishRGC3 0 1628 0 0 0
ZfishRGC4 0 1793 0 0 0
ZfishRGC5 0 0 4481 0 0
ZfishRGC6 0 0 4026 0 0
ZfishRGC7 0 0 3393 0 0
ZfishRGC8 0 0 2339 0 0
ZfishRGC9 0 0 0 2001 0
ZfishRGC10 0 0 0 1935 0
ZfishRGC11 0 0 0 0 4131
ZfishRGC12 0 0 0 0 4029
ZfishRGC13 0 0 0 0 4114
ZfishRGC14 0 0 0 0 4306
ZfishRGC15 0 0 0 0 4083
Cluster the data using functionalities within Seurat.
# Log normalize the data, identify the top 2000 variable features, and scale all genes
adult <- NormalizeData(adult, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
adult <- FindVariableFeatures(adult, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
all.genes <- rownames(adult)
adult <- ScaleData(adult, features = all.genes)
Centering and scaling data matrix
|
| | 0%
|
|====== | 6%
|
|============ | 12%
|
|=================== | 18%
|
|========================= | 24%
|
|=============================== | 29%
|
|===================================== | 35%
|
|============================================ | 41%
|
|================================================== | 47%
|
|======================================================== | 53%
|
|============================================================== | 59%
|
|===================================================================== | 65%
|
|=========================================================================== | 71%
|
|================================================================================= | 76%
|
|======================================================================================= | 82%
|
|============================================================================================== | 88%
|
|==================================================================================================== | 94%
|
|==========================================================================================================| 100%
# Run PCA on variable features and visualize the dimensionality of the dataset using an elbow plot
adult <- RunPCA(adult, features = VariableFeatures(object = adult))
PC_ 1
Positive: CALB2A, ZGC:65851, ATP1A3A, ELAVL4, ATP2B3B, SI:DKEY-33C12.3, DNAJB5, NAT8L, SCG2B, RGS16
OLFM1B, RTN4RL2A, ATP1A3B, SCRT1A, PCP4A, TUBB2, ATP1B1B, ADCYAP1B, CACNA1AA, EGR4
NPAS4A, CLSTN3, ANXA13L, POU4F1, NEFMA, EDIL3, COX6B1, ELMOD1, IGFBP5B, KCNIP1B
Negative: AQP1A.1, SLC1A2B, ZFP36L1B, FABP7A, SDC4, S1PR1, APOEB, MLC1, HSD11B2, TMEM176L.1
ATP1A1B, HEPACAM, ZGC:112332, GLULB, PTGDS, RPZ5, PTGDSB, SLC43A2B, RHBG, TGFB3
HER6, ZGC:195173, RLBP1A, SPARC, SDPRA, CX43, GLULA, MDKA, MSI1, COL15A1B
PC_ 2
Positive: ZGC:109965, ARL13A, NDRG1A, SI:CH211-251B21.1, TMEM244, PDE6G, PPDPFA, SAGA, CNGA1, SAGB
FSCN2B, CRX, KCNV2A, NR2E3, ARL3L2, SI:CH211-113D22.2, MEIG1, GNGT2A, ATP1B2B, TULP2
PDCA, ELOVL4B, TULP1B, ROM1B, GUCA1B, GUK1B, RBP4L, GNAT1, TULP1A, SAMD7
Negative: SLC1A2B, AQP1A.1, S1PR1, MLC1, ZFP36L1B, PTGDS, ATP1A1B, PTGDSB, SLC43A2B, APOEB
CX43, HEPACAM, RPZ5, ATP1B4, ZGC:195173, TGFB3, IGFBP1A, RHBG, CAHZ, SPARC
ZGC:112332, SI:CH211-152C2.3, TMEM176L.1, SDPRA, MFAP5, BGNB, GULP1A, HER6, COL15A1B, HYAL6
PC_ 3
Positive: CNGA1, SAGB, SAGA, ROM1B, SI:CH211-113D22.2, GNAT1, PDCA, GUCA1B, SYT5A, GNGT1
PPDPFA, NDRG1A, RBP4L, CRX, RS1A, ELOVL4B, EFNA1B, CABP5A, PDE6H, ATP1B2B
CABP2A, PDE6G, TULP1B, ARL13A, ARL3L2, LRIT1A, RHO, NEUROD4, ZGC:109965, KCNV2A
Negative: RPLP0, RPL7, RPL37A, RPL36, SI:DKEY-151G10.6, RPS15A, RPL32, RPS7, RPL35A, RPS19
RPL37, RPLP1, RPL11, FAUA, RPS3A, RPS18, RPL34, RPS15, RPL19, RPS12
RPL21, RPS21, RPL26, RPS2, RPL10, RPL23, RPS17, RPS14, RPL35, RPS5
PC_ 4
Positive: SEPP1A, FADS2, PCDH7B, SI:CH211-152C2.3, ATP1A3B, MDKA, GRIA3A, CAHZ, ATP2B3B, RLBP1A
ICN, PBXIP1B, FXYD6L, LEO1, MT2, S100A10B, ZFHX3, EPAS1B, MEF2CB, CLSTN1
SI:CH211-286B5.5, GYG1B, EEF1A2, PLECA, ATP1A3A, ATP6V1E1A, SOX13, CLSTN3, CLSTN2, DAP1B
Negative: HSP70.3, UBB, HSP90AA1.2, HSP70.2, JUN, HSP70L, DNAJB1B, HSP70.1, ATF3, MCL1A
PPP1R15A, HSPA5, CBX7A, ACTB1, JUNBA, PDCD4B, LAPTM4B, MKNK2B, JUNBB, HSPA4A
GAP43, NFIL3-6, SI:CH211-195B15.8, NOCTA, PTMAB, GADD45BA, JDP2B, TGIF1, ZFAND2A, CXCL14
PC_ 5
Positive: CABP5A, RS1A, CABP2A, LRIT1A, EFNA1B, GNB3A, VSX1, SYT5A, NME2A, EFNA1A
GABRR2A, PPP1R1B, TMSB, IRBPL, BHLHE23, SNX8B, IGSF21B, CPLX4A, NEUROD4, SI:CH1073-83N3.2
LRIT1B, TUBB5, ARHGEF25B, SI:CH211-285J22.3, EML1, SV2BA, OTX1B, ALCAMB, ABCC8B, PPP1R3CB
Negative: HSP90AA1.2, HSP70.2, HSP70.1, CALB2A, HSP70.3, UBB, DNAJB1B, HSP70L, JUN, GADD45BA
MCL1A, JDP2B, RTN4RL2A, NPAS4A, ATF3, EGR4, PPP1R15A, NRGNA, DNAJB5, JUNBB
SI:CH211-113D22.2, GUCA1B, PDCA, CABZ01080568.1, SAGA, SAGB, NFIL3-6, CNGA1, ROM1B, ELOVL4B
DimPlot(adult, reduction = "pca")
ElbowPlot(adult, ndims = 50)
# Compute clusters using 30 PCs and visualize using UMAP and tSNE
adult <- FindNeighbors(adult, dims = 1:30)
Computing nearest neighbor graph
Computing SNN
adult <- FindClusters(adult, resolution = .5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 48588
Number of edges: 1710634
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9356
Number of communities: 25
Elapsed time: 7 seconds
adult <- RunTSNE(adult, dims = 1:30)
adult <- RunUMAP(adult, dims = 1:30)
11:53:46 UMAP embedding parameters a = 0.9922 b = 1.112
11:53:46 Read 48588 rows and found 30 numeric columns
11:53:46 Using Annoy for neighbor search, n_neighbors = 30
11:53:46 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:53:57 Writing NN index file to temp file /var/folders/lg/jzd8kwv13k18nl7j7q_drbkw0000gn/T//RtmpV4FyTF/file139747f5c3ff
11:53:57 Searching Annoy index using 1 thread, search_k = 3000
11:54:11 Annoy recall = 100%
11:54:13 Commencing smooth kNN distance calibration using 1 thread
11:54:16 Initializing from normalized Laplacian + noise
11:54:19 Commencing optimization for 200 epochs, with 2170330 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:54:45 Optimization finished
DimPlot(adult, reduction = "tsne", group.by = "batch")
Perform data integration to correct for batch effects. To save memory, this block is not run and the integrated dataset is loaded after.
# Split object by batch number
adult.list <- SplitObject(adult, split.by = "batch")
# Normalize each dataset and find variable features
for (i in 1:length(adult.list)) {
adult.list[[i]] <- NormalizeData(adult.list[[i]], verbose = FALSE)
adult.list[[i]] <- FindVariableFeatures(adult.list[[i]], selection.method = "vst",
nfeatures = 1500, verbose = FALSE)
}
# Find Integration anchors
adult.anchors <- FindIntegrationAnchors(object.list = adult.list, dims = 1:40)
# Integrate Data and save the object
adult.integrated <- IntegrateData(anchorset = adult.anchors, dims = 1:40)
saveRDS(adult.integrated,"../Objects2020/zFish_SeuratClusteredIntegrated_02042020.rds")
Run dimensionality reduction and clustering on the integrated data
# Switch to integrated assay. The variable features of this assay are automatically
# set during IntegrateData
DefaultAssay(adult.integrated) <- "integrated"
# Run the standard workflow for visualization and clustering
adult.integrated <- ScaleData(adult.integrated, verbose = FALSE)
adult.integrated <- RunPCA(adult.integrated, npcs = 40, verbose = FALSE)
adult.integrated <- RunUMAP(adult.integrated, reduction = "pca", dims = 1:40)
13:06:08 UMAP embedding parameters a = 0.9922 b = 1.112
13:06:08 Read 48588 rows and found 40 numeric columns
13:06:08 Using Annoy for neighbor search, n_neighbors = 30
13:06:08 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:06:16 Writing NN index file to temp file /var/folders/lg/jzd8kwv13k18nl7j7q_drbkw0000gn/T//RtmpV4FyTF/file139747bea52d8
13:06:16 Searching Annoy index using 1 thread, search_k = 3000
13:06:31 Annoy recall = 100%
13:06:32 Commencing smooth kNN distance calibration using 1 thread
13:06:34 Initializing from normalized Laplacian + noise
13:06:37 Commencing optimization for 200 epochs, with 2337680 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:07:11 Optimization finished
adult.integrated <- RunTSNE(adult.integrated, reduction = "pca", dims = 1:40)
adult.integrated <- FindNeighbors(adult.integrated, dims = 1:40)
Computing nearest neighbor graph
Computing SNN
adult.integrated <- FindClusters(adult.integrated) #Run Louvain algorithm
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 48588
Number of edges: 2911997
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9496
Number of communities: 43
Elapsed time: 9 seconds
# Check out UMAP (integrated vs. non-integrated)
p1 <- DimPlot(adult.integrated, reduction = "umap", group.by = "batch")
p2 <- DimPlot(adult, reduction = "umap", group.by = "batch")
plot_grid(p1, p2, labels = c("Integrated", "Non-integrated"))
Investigate cluster markers and remove contaminant cell types
Since a large number of cells were removed, repeat integration and clustering steps.
# Split by batch
adult.list <- SplitObject(adult, split.by = "batch")
# Normalize datasets and find variable features
for (i in 1:length(adult.list)) {
print(i)
adult.list[[i]] <- NormalizeData(adult.list[[i]], verbose = FALSE)
adult.list[[i]] <- FindVariableFeatures(adult.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
}
# Find Integration anchors and integrate
adult.anchors <- FindIntegrationAnchors(object.list = adult.list, dims = 1:40)
adult.integrated <- IntegrateData(anchorset = adult.anchors, dims = 1:40)
DefaultAssay(adult.integrated) <- "integrated"
# Run the standard workflow for visualization and clustering using the integrated data
adult.integrated <- ScaleData(adult.integrated, verbose = FALSE)
adult.integrated <- RunPCA(adult.integrated, npcs = 40, verbose = FALSE)
adult.integrated <- RunUMAP(adult.integrated, reduction = "pca", dims = 1:40)
adult.integrated <- RunTSNE(adult.integrated, reduction = "pca", dims = 1:40)
adult.integrated <- FindNeighbors(adult.integrated, dims = 1:40)
adult.integrated <- FindClusters(adult.integrated) #Run Louvain algorithm
# Save
saveRDS(adult.integrated,"../Objects2020/zFish_SeuratClusteredIntegratedWithPruning_02042020.rds")
Investigate variable features and remove any remanining contaminants. Set final cluster assignments.
adult.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_logFC)
# Remove some remaining contaminant clusters
# Cluster 10, 20, 22 due to lack of unique differentially expressed genes
# Cluster 35: Muller Glia (RLBP1A, APOEB)
VlnPlot(adult.integrated, "GRIA3A", pt.size = 0)
VlnPlot(adult.integrated, "KCNIP3B", pt.size = 0)
head(subset(adult.markers, cluster == 20))
VlnPlot(adult.integrated, "CALB2A", pt.size = 0)
VlnPlot(adult.integrated, "PPP1R1C", pt.size = 0)
head(subset(adult.markers, cluster == 22))
VlnPlot(adult.integrated, "GABRA5", pt.size = 0)
VlnPlot(adult.integrated, "APOEB", pt.size = 0)
VlnPlot(adult.integrated, "RLBP1A", pt.size = 0)
# Remove contaminant cell types
Idents(adult.integrated) <- "integrated_snn_res.0.8"
idents.remove <- c(10, 20, 22, 35)
adult.integrated = subset(adult.integrated, idents = setdiff(levels(Idents(adult.integrated)), idents.remove))
# Refactor final cluster assignments as clusterID
adult.integrated@meta.data$clusterID = droplevels(Idents(adult.integrated))
levels(adult.integrated@meta.data$clusterID) = 1:length(levels(adult.integrated@meta.data$clusterID))
Idents(adult.integrated) <- "clusterID"
Construct dendrogram based on hierarchical clustering and reorder clusters
# Build dendrogram
adult.integrated <- FindVariableFeatures(adult.integrated, selection.method = "vst", nfeatures = 500)
selection.method set to 'vst' but count slot is empty; will use data slot insteadCalculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
NaNs producednumber of items to replace is not a multiple of replacement lengthCalculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
adult.integrated <- BuildClusterTree(adult.integrated)
Finished averaging integrated for cluster 1
Finished averaging integrated for cluster 2
Finished averaging integrated for cluster 3
Finished averaging integrated for cluster 4
Finished averaging integrated for cluster 5
Finished averaging integrated for cluster 6
Finished averaging integrated for cluster 7
Finished averaging integrated for cluster 8
Finished averaging integrated for cluster 9
Finished averaging integrated for cluster 10
Finished averaging integrated for cluster 11
Finished averaging integrated for cluster 12
Finished averaging integrated for cluster 13
Finished averaging integrated for cluster 14
Finished averaging integrated for cluster 15
Finished averaging integrated for cluster 16
Finished averaging integrated for cluster 17
Finished averaging integrated for cluster 18
Finished averaging integrated for cluster 19
Finished averaging integrated for cluster 20
Finished averaging integrated for cluster 21
Finished averaging integrated for cluster 22
Finished averaging integrated for cluster 23
Finished averaging integrated for cluster 24
Finished averaging integrated for cluster 25
Finished averaging integrated for cluster 26
Finished averaging integrated for cluster 27
Finished averaging integrated for cluster 28
Finished averaging integrated for cluster 29
Finished averaging integrated for cluster 30
Finished averaging integrated for cluster 31
Finished averaging integrated for cluster 32
Finished averaging integrated for cluster 33
PlotClusterTree(adult.integrated)
plot(adult.integrated@tools$BuildClusterTree)
# Reorder clusters according to dendrogram for dotplot plotting
tree_obj = adult.integrated@tools$BuildClusterTree
left_clusts = Seurat:::GetLeftDescendants(tree_obj, length(levels(adult.integrated@meta.data$clusterID))+1)
right_clusts = Seurat:::GetRightDescendants(tree_obj, length(levels(adult.integrated@meta.data$clusterID))+1)
clust_order = c(left_clusts, right_clusts)
adult.integrated@meta.data$dendro_order = factor(adult.integrated@meta.data$clusterID, levels = clust_order)
# Save final adult object
saveRDS(adult.integrated, "../Objects2020/adult_zFish_object.rds")
Identify all variable features
adult.markers <- FindAllMarkers(adult, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
saveRDS(adult.markers, "../Objects2020/adult_ClusterMarkers")
Create plots for figures
# Read in object, set order based on dendrogram, and convert genes to lower case
adult <- readRDS("../Objects2020/adult_zFish_object.rds")
Idents(adult) <- "dendro_order"
adult <- LowerCase_genes(adult)
# Plot the top 3 DE genes in each cluster
mapGenes_vector <- vector()
for(i in levels(Idents(adult))){
mapGenes_vector = union(mapGenes_vector, head(subset(adult.markers, cluster==i))[1:3,]$gene)
}
mapGenes_vector <- tolower(mapGenes_vector[2:length(mapGenes_vector)])
DotPlot(adult, features = mapGenes_vector, assay = "RNA") + RotatedAxis()
# Plot canonical RGC markers
DotPlot(adult, features = c("robo2", "isl2b", "rbpms2b"), scale.min = 0, assay = "RNA") + RotatedAxis()
# Visualize clusters on tSNE and UMAP
DimPlot(adult, reduction = "tsne", label = TRUE, group.by = "clusterID")
DimPlot(adult, reduction = "umap", label = TRUE, group.by = "clusterID")
# Plot relative size of each cluster
PlotClusterSize(adult)